# Importing libraries
import yaml
import pymongo
import pandas as pd
from collections import Counter
# Import plot libraries
import plotly.express as px
# File function - Read dict from YAML file
def get_dict_from_yaml(yaml_path: str, encoding: str = "utf-8") -> dict:
    """Read a YAML file and return its content.

    Loading is best-effort: any error raised while opening or parsing the
    file is printed and an empty dictionary is returned instead.

    Args:
        yaml_path: Path of the YAML file to read.
        encoding: Text encoding used to open the file.

    Returns:
        The parsed YAML content. An empty dict is returned when the file is
        empty, cannot be read or cannot be parsed. A non-mapping top-level
        value is returned unchanged.
    """
    result = {}
    try:
        with open(yaml_path, mode="r", encoding=encoding) as file:
            yaml_file = file.read()
        # NOTE(review): FullLoader is kept for compatibility; prefer
        # yaml.safe_load if this file can ever come from an untrusted source.
        loaded = yaml.load(yaml_file, Loader=yaml.FullLoader)
        # An empty YAML document parses to None; keep the dict contract.
        if loaded is not None:
            result = loaded
    except Exception as e:
        # Deliberate best-effort: report the problem and fall back to {}.
        print(e)
    return result
def string_to_list(str_list: str) -> list:
    """Convert a stringified list such as "[a, b, c]" into a list of strings.

    Square brackets, spaces and newlines are removed everywhere in the input
    (including inside items), then the remainder is split on commas. Empty
    items are dropped, so "", "[]" and "[ ]" all yield an empty list.

    Args:
        str_list: Text representation of a list, items separated by commas.

    Returns:
        The non-empty items, in their original order.
    """
    # NOTE(review): spaces inside an item are removed too ("la Victoria"
    # becomes "laVictoria"); kept as-is because existing counts rely on it.
    cleaned = str_list.translate(str.maketrans("", "", "[] \n"))
    # Filtering empty items avoids counting "" as a real noun/entity.
    return [item for item in cleaned.split(",") if item]
# Util function - Plot horizontal bar chart
def plot_bar_chart(df, x_var, y_var, title, color=None):
    """Draw and display a horizontal Plotly bar chart.

    Args:
        df: Data passed to ``px.bar``.
        x_var: Column used for the bar lengths (x axis).
        y_var: Column used for the bar categories (y axis).
        title: Chart title.
        color: Optional column used to color the bars.
    """
    layout_options = {
        "yaxis": {'categoryorder': 'total ascending'},
        "showlegend": True,
    }
    chart = px.bar(df, x=x_var, y=y_var, orientation='h', title=title, color=color)
    # Order the categories by their total value and always show the legend.
    chart.update_layout(**layout_options)
    chart.show()
# Load the MongoDB connection settings from the YAML config file
filepath = "config/mdb_setup.yaml"
mdb_login = get_dict_from_yaml(filepath)

# Create connection to MongoDB, then select the database and the collection
client = pymongo.MongoClient(host=mdb_login['db_server'], port=mdb_login['db_port'])
db = client.get_database(mdb_login['db_name'])
coll = db.get_collection(mdb_login['db_collection'])
# Query data: fetch every document of the collection
doc_list = []
for doc in coll.find({}):
    # The nouns/entities of each argument component are stored as text;
    # turn them into real lists before keeping the document.
    for component in ('claim', 'majorClaim', 'premise'):
        for field in ('nouns', 'entities'):
            doc[component][field] = string_to_list(doc[component][field])
    doc_list.append(doc)

print(">> Number of annotated documents:", len(doc_list))
>> Number of annotated documents: 4702
# Display the first document of the list to inspect its structure
doc_list[0]
{'_id': ObjectId('618315e580c699e1073e9a16'),
'argumentID': '5-1',
'approach': 'A2 -> C+(L+P)',
'claim': {'text': 'Mi idea es dar un uso al arco de la victoria de Moncloa , en lugar de demolerlo , como soporte',
'nouns': ['idea',
'uso',
'arco',
'victoria',
'lugar',
'demolerlo',
'soporte'],
'entities': ['Moncloa']},
'linker': {'linker': 'para',
'category': 'CONSEQUENCE',
'subCategory': 'GOAL',
'relationType': 'support'},
'mainVerb': 'dar',
'majorClaim': {'text': 'Uso artistico y de promocion del Arco de la Victoria',
'nouns': ['Uso', 'promocion'],
'entities': ['Arco', 'laVictoria']},
'premise': {'text': 'promos de actividades en Madrid',
'nouns': ['promos', 'actividades'],
'entities': ['Madrid']},
'proposalID': 5,
'sentence': 'Mi idea es dar un uso al arco de la victoria de Moncloa, en lugar de demolerlo, como soporte para promos de actividades en Madrid (semana del orgullo gay, congresos mundiales, eventos futbol, etc)'}
# Local variables
n_top = 20  # size of the "top N" rankings
topic_filter = ""  # substring a sentence must contain; "" selects every document
ls_linkers = []  # one [category, subCategory, linker, relationType] row per document
entities = Counter()
nouns = Counter()
main_verbs = Counter()

# Counting data
for doc in doc_list:
    # "" is a substring of any string, so an empty filter keeps every document;
    # a non-empty filter keeps only the sentences that contain it.
    if topic_filter not in doc["sentence"]:
        continue

    # Linker data
    doc_linker = doc["linker"]
    ls_linkers.append([
        doc_linker["category"],
        doc_linker["subCategory"],
        doc_linker["linker"],
        doc_linker["relationType"],
    ])

    # Entities data: the set makes each document count an entity at most once
    entities.update(set(doc["claim"]["entities"] + doc["majorClaim"]["entities"] + doc["premise"]["entities"]))

    # Nouns data: same per-document de-duplication as for entities
    nouns.update(set(doc["claim"]["nouns"] + doc["majorClaim"]["nouns"] + doc["premise"]["nouns"]))

    # Verbs data: documents without a main verb are skipped; case is ignored
    if doc["mainVerb"] is not None:
        main_verbs[doc["mainVerb"].lower()] += 1
# Build the linkers dataframe from the collected linker rows
linker_columns = ["category", "subCategory", "linker", "relationType"]
df_linkers = pd.DataFrame(ls_linkers, columns=linker_columns)
df_linkers.head(10)
| category | subCategory | linker | relationType | |
|---|---|---|---|---|
| 0 | CONSEQUENCE | GOAL | para | support |
| 1 | CONSEQUENCE | GOAL | para | support |
| 2 | EXPLANATION | EXEMPLIFICATION | como por ejemplo | support |
| 3 | CAUSE | CONDITION | si | qualifier |
| 4 | CONSEQUENCE | GOAL | para | support |
| 5 | CONSEQUENCE | GOAL | para | support |
| 6 | CONSEQUENCE | GOAL | para | support |
| 7 | CONSEQUENCE | GOAL | para | support |
| 8 | CONSEQUENCE | GOAL | para | support |
| 9 | CAUSE | REASON | porque | support |
# Number of linkers per category, most frequent first
df = (
    df_linkers.groupby(['category'])
    .size()
    .reset_index(name='frequency')
    .sort_values(by='frequency', ascending=False)
)
df
| category | frequency | |
|---|---|---|
| 1 | CONSEQUENCE | 3226 |
| 0 | CAUSE | 1189 |
| 4 | EXPLANATION | 167 |
| 2 | CONTRAST | 119 |
| 3 | ELABORATION | 1 |
# Plot top N most used categories
x_var = 'frequency'
y_var = 'category'
title = 'Top %s Most used categories' % min(n_top, len(df))
# Plot only the first n_top rows so the chart shows exactly the number of
# rows announced in the title (df is sorted by descending frequency).
plot_bar_chart(df.head(n_top), x_var, y_var, title)
# Number of linkers per (category, sub-category) pair, most frequent first
df = (
    df_linkers.groupby(['category', 'subCategory'])
    .size()
    .reset_index(name='frequency')
    .sort_values(by='frequency', ascending=False)
)
df.head(10)
| category | subCategory | frequency | |
|---|---|---|---|
| 2 | CONSEQUENCE | GOAL | 3158 |
| 0 | CAUSE | CONDITION | 662 |
| 1 | CAUSE | REASON | 527 |
| 6 | EXPLANATION | EXEMPLIFICATION | 137 |
| 4 | CONTRAST | ALTERNATIVE | 119 |
| 3 | CONSEQUENCE | RESULT | 68 |
| 7 | EXPLANATION | RESTATEMENT | 29 |
| 5 | ELABORATION | ADDITION | 1 |
| 8 | EXPLANATION | SUMMARY | 1 |
# Plot top N most used sub-categories, colored by their parent category
x_var = 'frequency'
y_var = 'subCategory'
title = 'Top %s Most used sub-categories' % min(n_top, len(df))
color_by = 'category'
# Plot only the first n_top rows so the chart shows exactly the number of
# rows announced in the title (df is sorted by descending frequency).
plot_bar_chart(df.head(n_top), x_var, y_var, title, color_by)
# Number of occurrences per (category, linker) pair, most frequent first
df = (
    df_linkers.groupby(['category', 'linker'])
    .size()
    .reset_index(name='frequency')
    .sort_values(by='frequency', ascending=False)
)
df.head(10)
| category | linker | frequency | |
|---|---|---|---|
| 56 | CONSEQUENCE | para | 3115 |
| 32 | CAUSE | si | 445 |
| 35 | CAUSE | ya que | 228 |
| 29 | CAUSE | porque | 160 |
| 77 | EXPLANATION | por ejemplo | 90 |
| 3 | CAUSE | ante | 67 |
| 30 | CAUSE | pues | 49 |
| 67 | CONTRAST | ni | 42 |
| 11 | CAUSE | debido a | 40 |
| 68 | CONTRAST | o | 37 |
# Plot top N most used linkers, colored by category
x_var, y_var, color_by = 'frequency', 'linker', 'category'
title = 'Top %s Most used linkers' % min(n_top, len(df))
plot_bar_chart(df.head(n_top), x_var, y_var, title, color=color_by)
# Number of occurrences per (relation type, linker) pair, most frequent first
df = (
    df_linkers.groupby(['relationType', 'linker'])
    .size()
    .reset_index(name='frequency')
    .sort_values(by='frequency', ascending=False)
)
df.head(10)
| relationType | linker | frequency | |
|---|---|---|---|
| 59 | support | para | 3115 |
| 16 | qualifier | si | 445 |
| 73 | support | ya que | 228 |
| 68 | support | porque | 160 |
| 60 | support | por ejemplo | 90 |
| 2 | qualifier | ante | 67 |
| 69 | support | pues | 49 |
| 80 | support/attack | ni | 42 |
| 46 | support | debido a | 40 |
| 13 | qualifier | frente a | 37 |
# Plot top N most used linkers, colored by relation type
x_var, y_var, color_by = 'frequency', 'linker', 'relationType'
title = 'Top %s Most used linkers' % min(n_top, len(df))
plot_bar_chart(df.head(n_top), x_var, y_var, title, color=color_by)
# Top N entities; "% share" is relative to the sum of all entity counts
top_entities = entities.most_common(n_top)
df = pd.DataFrame.from_records(top_entities, columns=['entities', 'frequency'])
df["% share"] = (100.0 * df["frequency"] / sum(entities.values())).round(2)
df.head(10)
| entities | frequency | % share | |
|---|---|---|---|
| 0 | Madrid | 538 | 8.82 |
| 1 | Ayuntamiento | 102 | 1.67 |
| 2 | EMT | 58 | 0.95 |
| 3 | Vallecas | 42 | 0.69 |
| 4 | MADRID | 40 | 0.66 |
| 5 | Retiro | 39 | 0.64 |
| 6 | Metro | 38 | 0.62 |
| 7 | Rio | 36 | 0.59 |
| 8 | AyuntamientodeMadrid | 32 | 0.52 |
| 9 | M-30 | 31 | 0.51 |
# Plot top N most used entities
x_var, y_var = 'frequency', 'entities'
title = 'Top %s Most used entities' % min(n_top, len(df))
plot_bar_chart(df.head(n_top), x_var, y_var, title)
# Top N nouns; "% share" is relative to the sum of all noun counts
top_nouns = nouns.most_common(n_top)
df = pd.DataFrame.from_records(top_nouns, columns=['nouns', 'frequency'])
df["% share"] = (100.0 * df["frequency"] / sum(nouns.values())).round(2)
df.head(10)
| nouns | frequency | % share | |
|---|---|---|---|
| 0 | calle | 367 | 1.36 |
| 1 | zona | 336 | 1.24 |
| 2 | calles | 259 | 0.96 |
| 3 | barrio | 258 | 0.95 |
| 4 | personas | 216 | 0.80 |
| 5 | parque | 212 | 0.78 |
| 6 | ciudad | 210 | 0.78 |
| 7 | uso | 189 | 0.70 |
| 8 | transporte | 189 | 0.70 |
| 9 | zonas | 186 | 0.69 |
# Plot top N most used nouns (df already holds at most n_top rows)
x_var, y_var = 'frequency', 'nouns'
title = 'Top %s Most used nouns' % min(n_top, len(df))
plot_bar_chart(df, x_var, y_var, title)
# Top N main verbs; "% share" is relative to the sum of all main-verb counts
top_main_verbs = main_verbs.most_common(n_top)
df = pd.DataFrame.from_records(top_main_verbs, columns=['main verbs', 'frequency'])
df["% share"] = (100.0 * df["frequency"] / sum(main_verbs.values())).round(2)
df.head(10)
| main verbs | frequency | % share | |
|---|---|---|---|
| 0 | propongo | 184 | 4.23 |
| 1 | crear | 171 | 3.94 |
| 2 | poner | 82 | 1.89 |
| 3 | hacer | 66 | 1.52 |
| 4 | ampliar | 62 | 1.43 |
| 5 | creo | 54 | 1.24 |
| 6 | tiene | 52 | 1.20 |
| 7 | gustaria | 48 | 1.10 |
| 8 | habilitar | 47 | 1.08 |
| 9 | tienen | 46 | 1.06 |
# Plot top N most used main verbs (df already holds at most n_top rows)
x_var, y_var = 'frequency', 'main verbs'
title = 'Top %s Most used main verbs' % min(n_top, len(df))
plot_bar_chart(df, x_var, y_var, title)